library(tidyverse)
library(readr)
library(purrr)
library(plotly)
city_state.homicide = read_csv("homicide-data.csv") |>
janitor::clean_names() |>
dplyr::mutate(
city_state = paste(city,",",state),
city_state = str_replace(city_state,"Tulsa , AL", "Tulsa , OK"))
## Rows: 52179 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): uid, victim_last, victim_first, victim_race, victim_age, victim_sex...
## dbl (3): reported_date, lat, lon
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
the raw data have 52179 records and13 variables. Column names are uid, reported_date, victim_last, victim_first, victim_race, victim_age, victim_sex, city, state, lat, lon, disposition, city_state.The data included the location of the killing, whether an arrest was made and, in most cases, basic demographic information about each victim.
I summarized within cities to obtain the total number of homicides and the number of unsolved homicides (those for which the disposition is “Closed without arrest” or “Open/No arrest”).
For the city of Baltimore, MD, use the prop.test function to estimate the proportion of homicides that are unsolved; save the output of prop.test as an R object, apply the broom::tidy to this object and pull the estimated proportion and confidence intervals from the resulting tidy dataframe.
p = homicide |>
group_by(city_state) |># there's no missing in city or state variables
summarise(total_count = n(),
unsolved_count = sum(disposition %in% c("Closed without arrest","Open/No arrest"))) |>
filter(city_state == "Baltimore , MD")
prop.test(pull(p,unsolved_count), pull(p,total_count),conf.level = .95) |>
broom::tidy() |>
select(estimate,conf.low, conf.high)
## # A tibble: 1 × 3
## estimate conf.low conf.high
## <dbl> <dbl> <dbl>
## 1 0.646 0.628 0.663
Now run prop.test for each of the cities in your dataset, and extract both the proportion of unsolved homicides and the confidence interval for each. Do this within a “tidy” pipeline, making use of purrr::map, purrr::map2, list columns and unnest as necessary to create a tidy dataframe with estimated proportions and CIs for each city.
I had a function calculating prop and its 95%CI.
prop = function(df){
pp = prop.test(pull(df,unsolved_count), pull(df,total_count),conf.level = .95) |>
broom::tidy() |>
select(estimate,conf.low, conf.high)
}
prop(p)#check if the function works
data_city = homicide |>
group_by(city_state) |># there's no missing in city or state variables
summarise(total_count = n(),
unsolved_count = sum(disposition %in% c("Closed without arrest","Open/No arrest")))
result <- data_city |>
group_by(city_state) |>
nest() |>
mutate(proportion_test = map(data, prop)) |>
select(proportion_test) |>
unnest(proportion_test)
## Adding missing grouping variables: `city_state`
#do I need to combine the two locations in Tulsa
Create a plot that shows the estimates and CIs for each city – check out geom_errorbar for a way to add error bars based on the upper and lower limits. Organize cities according to the proportion of unsolved homicides.
result |>
mutate(city_state = fct_reorder(city_state,estimate)) |>
ggplot(aes(x = city_state, y = estimate, color = city_state)) +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high))
longitudinal = function(path,filename) {
df =
read_csv(path) |>
janitor::clean_names() |>
mutate(id = filename) |>
pivot_longer(
cols = -id,
names_to = "week",
values_to = "value",
names_prefix = "week_") |>
separate(id,c("arm","subject_id"),sep="_") |>
mutate(
arm = recode(
arm,
con = "control",
exp = "experimental"),
subject_id = gsub("\\.csv$","",subject_id))
df
}
filelst = list.files("./data",full.names = TRUE)
q3_tidy =
purrr::map(filelst, ~ longitudinal(.x, basename(.x))) |>
bind_rows()
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q3_tidy |>
group_by(week) |>
summarise(n = n())
## # A tibble: 8 × 2
## week n
## <chr> <int>
## 1 1 20
## 2 2 20
## 3 3 20
## 4 4 20
## 5 5 20
## 6 6 20
## 7 7 20
## 8 8 20
q3_tidy |>
group_by(subject_id) |>
summarise(n = n())
## # A tibble: 10 × 2
## subject_id n
## <chr> <int>
## 1 01 16
## 2 02 16
## 3 03 16
## 4 04 16
## 5 05 16
## 6 06 16
## 7 07 16
## 8 08 16
## 9 09 16
## 10 10 16
q3_tidy |>
group_by(week, subject_id) |>
summarise(n = n())
## `summarise()` has grouped output by 'week'. You can override using the
## `.groups` argument.
## # A tibble: 80 × 3
## # Groups: week [8]
## week subject_id n
## <chr> <chr> <int>
## 1 1 01 2
## 2 1 02 2
## 3 1 03 2
## 4 1 04 2
## 5 1 05 2
## 6 1 06 2
## 7 1 07 2
## 8 1 08 2
## 9 1 09 2
## 10 1 10 2
## # ℹ 70 more rows
Make a spaghetti plot showing observations on each subject over time, and comment on differences between groups
q3_tidy |>
ggplot(aes(x = week, y = value, color = subject_id)) +
geom_line(aes(group = subject_id),se = FALSE)+
facet_grid(~arm)
## Warning in geom_line(aes(group = subject_id), se = FALSE): Ignoring unknown
## parameters: `se`
Participants in control arm didn’t show significant change in value over the 8 weeks of study, whereas participants in experimental arm saw a considerable increase in value over the 8 weeks.
set.seed(1)
p3 = function(mu,n = 30,sigma = 5){#mu must be in the first place?
sim_data = tibble(
x = rnorm(n, mean = mu, sd = sigma),
)
sim_data |>
t.test() |> broom::tidy() |>
select(estimate, p.value) |>
janitor::clean_names()
}
results_df =
map(1:5000, \(i) p3(mu = 0)) |> bind_rows()
sim_results_df =
expand_grid(
mu = 1:6,
iter = 1:5000) |>
mutate(
estimate_df = map(mu,p3)) |>
unnest(estimate_df)
Make a plot showing the proportion of times the null was rejected (the power of the test) on the y axis and the true value of μ on the x axis. Describe the association between effect size and power.
plot_df = sim_results_df |>
mutate(
conclusion = ifelse(p_value < 0.05, "reject","not reject")) |>
group_by(mu) |>
summarise(total_count = n(),
reject = sum(conclusion == "reject")) |>
mutate(prop = reject/total_count)
power = plot_df |>
plot_ly(x = ~mu, y = ~prop, type = 'scatter', mode = "line") |>
layout(title = 'Association between effect size and power', plot_bgcolor = "#e5ecf6",
xaxis = list(title = "True mu",
tickvals = seq(1, 6, by = 1),
ticktext = seq(1, 6, by = 1)),
yaxis = list(title = "Power"))
power
Make a plot showing the average estimate of μ^ on the y axis and the true value of μ on the x axis.
estimate_all = sim_results_df |>
plot_ly(x = ~mu, y = ~estimate, type = 'box', name = "all") |>
layout(title = 'Association between true mu and its estimate', plot_bgcolor = "#e5ecf6",
xaxis = list(title = "True mu",
tickvals = seq(1, 6, by = 1),
ticktext = seq(1, 6, by = 1)),
yaxis = list(title = "Estimate"))
estimate_all
Make a second plot (or overlay on the first) the average estimate of μ^ only in samples for which the null was rejected on the y axis and the true value of μ on the x axis. Is the sample average of μ^ across tests for which the null is rejected approximately equal to the true value of μ ? Why or why not?
estimate_rej = sim_results_df |>
filter(p_value<0.05)
compare = add_trace(estimate_all, x = ~pull(estimate_rej,mu), y = ~pull(estimate_rej,estimate), type = 'box', name = "reject") |> layout(legend=list(title=list(text='Data Range')))
compare